library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
library(usmap)
library(lmtest)
library(pracma)
library(lmtest)
library(forecast)
library(vars)
library(rvest)
library(RSelenium)
library(seleniumPipes)
library(dLagM)
library(jsonlite)
library(rgdal)
library(esri2sf)
library(readr)
library(tis)
options(
tigris_class = "sf",
tigris_use_cache = TRUE
)
Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")
# SET your path here to the covid19analysis folder.
sg_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/'
sg_hourly_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/weekly-patterns/v2/hourly/'
Load SF and Alameda case data and Alameda visits data.
# block groups
sf_blockgroups <-
block_groups("CA","San Francisco", cb=F, progress_bar=F) %>%
st_transform('+proj=longlat +datum=WGS84')
ac_blockgroups <-
block_groups("CA","Alameda", cb=F, progress_bar=F) %>%
st_transform('+proj=longlat +datum=WGS84')
sf_bgs <- sf_blockgroups %>% pull(GEOID)
ac_bgs <- ac_blockgroups %>% pull(GEOID)
# zip code areas
zctas_94 <-
zctas(cb = F, starts_with = "94") %>%
st_transform('+proj=longlat +datum=WGS84') %>%
dplyr::select(ZCTA5CE10, geometry)
zctas_95 <-
zctas(cb = F, starts_with = "95") %>%
st_transform('+proj=longlat +datum=WGS84') %>%
dplyr::select(ZCTA5CE10, geometry)
zctas_94_95 <- rbind(zctas_94, zctas_95)
# join with block groups
sf_bg_zctas <- sf_blockgroups %>%
st_centroid() %>%
st_join(zctas_94_95) %>%
dplyr::select(GEOID, ZCTA5CE10) %>%
rename(blockgroup = GEOID, zipcode = ZCTA5CE10)
ac_bg_zctas <- ac_blockgroups %>%
st_centroid() %>%
st_join(zctas_94_95) %>%
dplyr::select(GEOID, ZCTA5CE10) %>%
rename(blockgroup = GEOID, zipcode = ZCTA5CE10)
# get SF case data
sf_place_cases <-
read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv") %>%
filter(county == 'San Francisco')
# get population data for San Francisco
acs_vars = readRDS("/Users/simonespeizer/CEE 218Z/censusData2018_acs_acs5.rds")
# define a function for pulling census data
pullCensus <- function(variableToPull, county) {
regionString <- paste0("state:06+county:", county)
censusData <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = regionString,
vars = variableToPull
) %>%
mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
return(censusData)
}
sf_fips <- fips("CA", "San Francisco") %>% substr(3,5)
# getting population by zip code
sf_pop_zip <- pullCensus("B01003_001E", sf_fips) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_pop_zip = sum(B01003_001E))
# join with cases and calculate cases per person
sf_cases_zip <- sf_place_cases %>% left_join(sf_pop_zip, by = c("place" = "zipcode")) %>%
mutate(cases_by_pop = confirmed_cases / total_pop_zip) %>%
rename(zipcode = place)
# get Alameda County case data - downloaded manually
ac_place_cases <- read.csv("Simone_Speizer/Alameda_County_Cumulative_Cases_By_Place_And_Zip.csv")
# handle the ones that are reported as <10 by calling them 5 cases. Note this is a simplification/choice I made and could be changed.
ac_cases_zip <- ac_place_cases %>%
rename(date = DtCreate) %>%
mutate(date = date %>% substr(1,10) %>% as.Date()) %>%
dplyr::select(c(date, contains("F9"))) %>% # only select zip code data
gather(key = "zipcode", value = "cases", -date) %>%
mutate(cases = ifelse(cases == "<10", "5", cases),
zipcode = zipcode %>% substr(2,6)) %>% # replace cases <10 with 10 and remove the "F" from zipcode names
mutate(cases = as.numeric(cases)) %>%
arrange(date)
# get Alameda County populations by zip code
ac_fips <- fips("CA", "Alameda") %>% substr(3,5)
# calculate population by zip code
ac_pop_zip <- pullCensus("B01003_001E", ac_fips) %>%
left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_pop_zip = sum(B01003_001E))
# join with cases
ac_cases_zip <- ac_cases_zip %>% left_join(ac_pop_zip) %>%
mutate(cases_by_pop = cases / total_pop_zip)
ac_daily_visits_zip_1 <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))
ac_daily_visits_zip_2 <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_05-25-20_05-31-20.rds"))
ac_daily_visits_zip <- rbind(ac_daily_visits_zip_1, ac_daily_visits_zip_2)
# calculate cumulative visits over time
ac_cum_visits_zip <- ac_daily_visits_zip %>%
mutate(total_visits_avg = replace_na(total_visits_avg, 0)) %>% # replace NAs with zero so they don't lead to NAs all following that date
group_by(zipcode) %>%
mutate(cumulative_visits = cumsum(total_visits_avg)) %>%
left_join(ac_pop_zip) %>%
mutate(cum_visits_per_capita = cumulative_visits / total_pop_zip)
Testing out using past data to predict future data, for a particular zip code. I use 94606.
zip_select <- "94606"
ac_cases_zip_select <- ac_cases_zip %>% filter(zipcode == zip_select)
ac_cum_visits_zip_select <- ac_cum_visits_zip %>% filter(zipcode == zip_select)
ac_cum_visits_zip_select %>% plot_ly() %>%
add_trace(x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
ac_cases_zip_select %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative cases per person'))
For this zip code, it looks like there is a change in case growth rate after May 1st relative to before May 1st. I will assume a lag of 14 days on the visits data, and look at the slope of the cumulative visits data from April 3rd to April 16th, and April 17th through May 1st.
change_date <- as.Date("2020-05-01")
lag <- 14
date_range_width <- 14
start_date <- change_date - lag - date_range_width
break_date <- change_date - lag
end_date <- change_date - lag + date_range_width
ac_cum_visits_zip_select_1 <- ac_cum_visits_zip_select %>% filter(date < break_date & date >= start_date)
ac_cum_visits_zip_select_2 <- ac_cum_visits_zip_select %>% filter(date >= break_date & date <= end_date)
plot_ly() %>%
add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
model_1 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_1)
summary(model_1)
##
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.186179 -0.043693 -0.008393 0.070765 0.184887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.088e+04 1.192e+02 -91.23 <2e-16 ***
## date 5.937e-01 6.494e-03 91.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09795 on 12 degrees of freedom
## Multiple R-squared: 0.9986, Adjusted R-squared: 0.9984
## F-statistic: 8359 on 1 and 12 DF, p-value: < 2.2e-16
model_2 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_2)
summary(model_2)
##
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.074767 -0.030712 -0.002396 0.019025 0.099361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.225e+04 4.979e+01 -246.0 <2e-16 ***
## date 6.685e-01 2.710e-03 246.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04534 on 13 degrees of freedom
## Multiple R-squared: 0.9998, Adjusted R-squared: 0.9998
## F-statistic: 6.087e+04 on 1 and 13 DF, p-value: < 2.2e-16
plot_ly() %>%
add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = fitted(model_1), mode = 'lines', name = "model 1", text = paste("slope:", summary.lm(model_1)$coef[2])) %>%
add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = fitted(model_2), mode = 'lines', name = "model 2", text = paste("slope:", summary.lm(model_2)$coef[2])) %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
There is a difference, not huge, but it does exist.
Try this again but using more data (3 week range instead of 2)
change_date <- as.Date("2020-05-01")
lag <- 14
date_range_width <- 21
start_date <- change_date - lag - date_range_width
break_date <- change_date - lag
end_date <- change_date - lag + date_range_width
ac_cum_visits_zip_select_1 <- ac_cum_visits_zip_select %>% filter(date < break_date & date >= start_date)
ac_cum_visits_zip_select_2 <- ac_cum_visits_zip_select %>% filter(date >= break_date & date <= end_date)
plot_ly() %>%
add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
model_1 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_1)
summary(model_1)
##
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24410 -0.11527 0.02113 0.07267 0.30255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.140e+04 1.055e+02 -108.1 <2e-16 ***
## date 6.224e-01 5.748e-03 108.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1595 on 19 degrees of freedom
## Multiple R-squared: 0.9984, Adjusted R-squared: 0.9983
## F-statistic: 1.172e+04 on 1 and 19 DF, p-value: < 2.2e-16
model_2 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_2)
summary(model_2)
##
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.06848 -0.03712 -0.01818 0.02374 0.15295
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.217e+04 3.563e+01 -341.6 <2e-16 ***
## date 6.642e-01 1.939e-03 342.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05769 on 20 degrees of freedom
## Multiple R-squared: 0.9998, Adjusted R-squared: 0.9998
## F-statistic: 1.174e+05 on 1 and 20 DF, p-value: < 2.2e-16
plot_ly() %>%
add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = fitted(model_1), mode = 'lines', name = "model 1", text = paste("slope:", summary.lm(model_1)$coef[2])) %>%
add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = fitted(model_2), mode = 'lines', name = "model 2", text = paste("slope:", summary.lm(model_2)$coef[2])) %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
Try with a different zip code.
# make this into a function
zipSelectFindSlopes <- function(zip_select, lag, change_date, date_range_width, ac_cum_visits_zip_select) {
start_date <- change_date - lag - date_range_width
break_date <- change_date - lag
end_date <- change_date - lag + date_range_width
ac_cum_visits_zip_select_1 <- ac_cum_visits_zip_select %>% filter(date < break_date & date >= start_date)
ac_cum_visits_zip_select_2 <- ac_cum_visits_zip_select %>% filter(date >= break_date & date <= end_date)
plot_ly() %>%
add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
model_1 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_1)
print(summary(model_1))
model_2 <- lm(cum_visits_per_capita ~ date, ac_cum_visits_zip_select_2)
print(summary(model_2))
plot_ly() %>%
add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 1") %>%
add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers', name = "time 2") %>%
add_trace(data = ac_cum_visits_zip_select_1, x = ~date, y = fitted(model_1), mode = 'lines', name = "model 1", text = paste("slope:", summary.lm(model_1)$coef[2])) %>%
add_trace(data = ac_cum_visits_zip_select_2, x = ~date, y = fitted(model_2), mode = 'lines', name = "model 2", text = paste("slope:", summary.lm(model_2)$coef[2])) %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
}
# select a new zip code
zip_select <- "94603"
ac_cases_zip_select <- ac_cases_zip %>% filter(zipcode == zip_select)
ac_cum_visits_zip_select <- ac_cum_visits_zip %>% filter(zipcode == zip_select)
ac_cum_visits_zip_select %>% plot_ly() %>%
add_trace(x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
ac_cases_zip_select %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative cases per person'))
This zip code seems to have a change in slope around May 10th. Again using 14 day lag.
change_date <- as.Date("2020-05-10")
lag <- 14
date_range_width <- 14
zipSelectFindSlopes(zip_select, lag, change_date, date_range_width, ac_cum_visits_zip_select)
##
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11836 -0.07764 -0.02610 0.04741 0.19391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.307e+04 1.263e+02 -103.5 <2e-16 ***
## date 7.132e-01 6.873e-03 103.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1037 on 12 degrees of freedom
## Multiple R-squared: 0.9989, Adjusted R-squared: 0.9988
## F-statistic: 1.077e+04 on 1 and 12 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.128067 -0.034061 0.001394 0.034022 0.165544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.195e+04 9.231e+01 -129.5 <2e-16 ***
## date 6.523e-01 5.021e-03 129.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08401 on 13 degrees of freedom
## Multiple R-squared: 0.9992, Adjusted R-squared: 0.9992
## F-statistic: 1.688e+04 on 1 and 13 DF, p-value: < 2.2e-16
Hmm, it’s the reverse of what we would expect for that one. Maybe a larger lag is necessary?
change_date <- as.Date("2020-05-10")
lag <- 20
date_range_width <- 14
zipSelectFindSlopes(zip_select, lag, change_date, date_range_width, ac_cum_visits_zip_select)
##
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25383 -0.05346 0.04665 0.09272 0.17186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.272e+04 1.702e+02 -74.77 <2e-16 ***
## date 6.943e-01 9.266e-03 74.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1398 on 12 degrees of freedom
## Multiple R-squared: 0.9979, Adjusted R-squared: 0.9977
## F-statistic: 5614 on 1 and 12 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12053 -0.02938 -0.01381 0.03765 0.09339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.241e+04 6.546e+01 -189.5 <2e-16 ***
## date 6.770e-01 3.562e-03 190.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0596 on 13 degrees of freedom
## Multiple R-squared: 0.9996, Adjusted R-squared: 0.9996
## F-statistic: 3.614e+04 on 1 and 13 DF, p-value: < 2.2e-16
Not quite right for this one still.
Try 94601.
zip_select <- "94560"
ac_cases_zip_select <- ac_cases_zip %>% filter(zipcode == zip_select)
ac_cum_visits_zip_select <- ac_cum_visits_zip %>% filter(zipcode == zip_select)
ac_cum_visits_zip_select %>% plot_ly() %>%
add_trace(x = ~date, y = ~cum_visits_per_capita, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative visits per person'))
ac_cases_zip_select %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Date'), yaxis = list(title = 'Cumulative cases per person'))
This one looks like it has picked up as of May 20th.
change_date <- as.Date("2020-05-20")
lag <- 14
date_range_width <- 14
zipSelectFindSlopes(zip_select, lag, change_date, date_range_width, ac_cum_visits_zip_select)
##
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.05385 -0.03211 -0.01497 0.03954 0.09100
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.314e+04 5.725e+01 -229.5 <2e-16 ***
## date 7.169e-01 3.115e-03 230.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04698 on 12 degrees of freedom
## Multiple R-squared: 0.9998, Adjusted R-squared: 0.9998
## F-statistic: 5.296e+04 on 1 and 12 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = cum_visits_per_capita ~ date, data = ac_cum_visits_zip_select_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.079267 -0.040764 -0.001597 0.028882 0.114863
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.321e+04 6.555e+01 -201.5 <2e-16 ***
## date 7.208e-01 3.564e-03 202.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05963 on 13 degrees of freedom
## Multiple R-squared: 0.9997, Adjusted R-squared: 0.9997
## F-statistic: 4.091e+04 on 1 and 13 DF, p-value: < 2.2e-16
A slight increase after the the shift point.
Considering a single zip code and looking at a trendline of past cases and visits, with a lag of 14 days, and seeing if this trend line fits the more recent data.
I will use zip code 94606 again.
zip_select <- "94606"
lag <- 14
ac_cases_zip_select <- ac_cases_zip %>% filter(zipcode == zip_select) %>%
mutate(change_cases_by_pop = c(NA, diff(cases_by_pop)))
ac_daily_visits_zip_select <- ac_daily_visits_zip %>% filter(zipcode == zip_select)
ac_daily_visits_lag <- ac_daily_visits_zip_select %>%
ungroup() %>%
dplyr::select(date, total_visits_avg) %>%
mutate(lag_date = date + lag) %>%
dplyr::select(lag_date, total_visits_avg)
ac_visits_cases_select <- left_join(ac_cases_zip_select, ac_daily_visits_lag, by = c("date" = "lag_date")) %>%
filter(cases >= 10) %>% # only select once cases have started to go above the threshold
mutate(visits_per_capita = total_visits_avg / total_pop_zip)
ac_visits_cases_select %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))
Try separating this out into all but the most recent week of data, then adding that in.
ac_visits_cases_select_1 <- ac_visits_cases_select %>%
filter(date < (max(date) - 7))
ac_visits_cases_select_2 <- ac_visits_cases_select %>%
filter(date >= (max(date) - 7))
model_visits_cases_1 <- lm(change_cases_by_pop ~ visits_per_capita, ac_visits_cases_select_1)
summary(model_visits_cases_1)
##
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_visits_cases_select_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.062e-05 -4.655e-05 -1.731e-05 3.558e-05 2.329e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.603e-05 4.849e-05 1.156 0.253
## visits_per_capita -1.376e-05 7.315e-05 -0.188 0.851
##
## Residual standard error: 5.417e-05 on 56 degrees of freedom
## Multiple R-squared: 0.0006317, Adjusted R-squared: -0.01721
## F-statistic: 0.0354 on 1 and 56 DF, p-value: 0.8514
plot_ly() %>%
add_trace(data = ac_visits_cases_select_1, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "past data") %>%
add_trace(data = ac_visits_cases_select_1, x = ~visits_per_capita, y = fitted(model_visits_cases_1), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_1)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_select_2, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "most recent week") %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))
Doesn’t look great. What if we remove dates on which there were zero new cases?
ac_visits_cases_select_1_no_zero <- ac_visits_cases_select_1 %>% filter(change_cases_by_pop > 0)
ac_visits_cases_select_2_no_zero <- ac_visits_cases_select_2 %>% filter(change_cases_by_pop > 0)
model_visits_cases_1_2 <- lm(change_cases_by_pop ~ visits_per_capita, ac_visits_cases_select_1_no_zero)
summary(model_visits_cases_1_2)
##
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_visits_cases_select_1_no_zero)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.607e-05 -3.561e-05 -1.600e-05 1.240e-05 2.120e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.157e-04 5.624e-05 2.056 0.0473 *
## visits_per_capita -6.344e-05 8.410e-05 -0.754 0.4557
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.077e-05 on 35 degrees of freedom
## Multiple R-squared: 0.016, Adjusted R-squared: -0.01212
## F-statistic: 0.569 on 1 and 35 DF, p-value: 0.4557
plot_ly() %>%
add_trace(data = ac_visits_cases_select_1_no_zero, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "past data") %>%
add_trace(data = ac_visits_cases_select_1_no_zero, x = ~visits_per_capita, y = fitted(model_visits_cases_1_2), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_1_2)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_select_2_no_zero, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "most recent week") %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))
Try if we look at growth rate (change divided by previous day’s levels).
ac_visits_cases_select_growth <- ac_visits_cases_select %>%
mutate(past_cases_by_pop = c(NA, ac_visits_cases_select$cases_by_pop[1:(nrow(ac_visits_cases_select)-1)]),
case_growth_daily = change_cases_by_pop*100 / past_cases_by_pop) %>%
filter(!is.na(case_growth_daily))
ac_visits_cases_select_growth %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))
This might look a little better.
ac_visits_cases_select_growth_1 <- ac_visits_cases_select_growth %>%
filter(date < (max(date) - 7))
ac_visits_cases_select_growth_2 <- ac_visits_cases_select_growth %>%
filter(date >= (max(date) - 7))
model_visits_cases_growth_1 <- lm(case_growth_daily ~ visits_per_capita, ac_visits_cases_select_growth_1)
summary(model_visits_cases_growth_1)
##
## Call:
## lm(formula = case_growth_daily ~ visits_per_capita, data = ac_visits_cases_select_growth_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1727 -3.6561 -0.7812 2.1856 20.5504
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.549 4.737 -0.749 0.457
## visits_per_capita 11.867 7.106 1.670 0.101
##
## Residual standard error: 4.904 on 55 degrees of freedom
## Multiple R-squared: 0.04826, Adjusted R-squared: 0.03096
## F-statistic: 2.789 on 1 and 55 DF, p-value: 0.1006
plot_ly() %>%
add_trace(data = ac_visits_cases_select_growth_1, x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "past data") %>%
add_trace(data = ac_visits_cases_select_growth_1, x = ~visits_per_capita, y = fitted(model_visits_cases_growth_1), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_select_growth_2, x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "most recent week") %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))
Removing the zero growth rate values.
model_visits_cases_growth_1_no_zero <- lm(case_growth_daily ~ visits_per_capita, ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0))
summary(model_visits_cases_growth_1_no_zero)
##
## Call:
## lm(formula = case_growth_daily ~ visits_per_capita, data = ac_visits_cases_select_growth_1 %>%
## filter(case_growth_daily > 0))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3577 -3.0053 -0.7427 0.4916 18.1492
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.448 5.841 -0.248 0.806
## visits_per_capita 12.313 8.656 1.423 0.164
##
## Residual standard error: 4.648 on 34 degrees of freedom
## Multiple R-squared: 0.05618, Adjusted R-squared: 0.02842
## F-statistic: 2.024 on 1 and 34 DF, p-value: 0.164
plot_ly() %>%
add_trace(data = ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "past data") %>%
add_trace(data = ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = fitted(model_visits_cases_growth_1_no_zero), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1_no_zero)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_select_growth_2 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "most recent week") %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))
That does look at least in the direction we’d expect, but is not significant still.
Try with one more different zip code. (94603)
zip_select <- "94603"
lag <- 14
ac_cases_zip_select <- ac_cases_zip %>% filter(zipcode == zip_select) %>%
mutate(change_cases_by_pop = c(NA, diff(cases_by_pop)))
ac_daily_visits_zip_select <- ac_daily_visits_zip %>% filter(zipcode == zip_select)
ac_daily_visits_lag <- ac_daily_visits_zip_select %>%
ungroup() %>%
dplyr::select(date, total_visits_avg) %>%
mutate(lag_date = date + lag) %>%
dplyr::select(lag_date, total_visits_avg)
ac_visits_cases_select <- left_join(ac_cases_zip_select, ac_daily_visits_lag, by = c("date" = "lag_date")) %>%
filter(cases >= 10) %>% # only select once cases have started to go above the threshold
mutate(visits_per_capita = total_visits_avg / total_pop_zip)
ac_visits_cases_select %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))
Separating this out into all but the most recent week of data, then adding that in.
ac_visits_cases_select_1 <- ac_visits_cases_select %>%
filter(date < (max(date) - 7))
ac_visits_cases_select_2 <- ac_visits_cases_select %>%
filter(date >= (max(date) - 7))
model_visits_cases_1 <- lm(change_cases_by_pop ~ visits_per_capita, ac_visits_cases_select_1)
summary(model_visits_cases_1)
##
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_visits_cases_select_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.824e-05 -4.538e-05 -9.027e-06 3.894e-05 1.880e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.552e-04 6.730e-05 2.307 0.0249 *
## visits_per_capita -1.158e-04 9.786e-05 -1.183 0.2419
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.136e-05 on 55 degrees of freedom
## Multiple R-squared: 0.02481, Adjusted R-squared: 0.007084
## F-statistic: 1.4 on 1 and 55 DF, p-value: 0.2419
plot_ly() %>%
add_trace(data = ac_visits_cases_select_1, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "past data") %>%
add_trace(data = ac_visits_cases_select_1, x = ~visits_per_capita, y = fitted(model_visits_cases_1), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_1)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_select_2, x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', name = "most recent week") %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))
Look at growth rate (change divided by previous day’s levels).
ac_visits_cases_select_growth <- ac_visits_cases_select %>%
mutate(past_cases_by_pop = c(NA, ac_visits_cases_select$cases_by_pop[1:(nrow(ac_visits_cases_select)-1)]),
case_growth_daily = change_cases_by_pop*100 / past_cases_by_pop) %>%
filter(!is.na(case_growth_daily))
ac_visits_cases_select_growth %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))
This might look a little better.
ac_visits_cases_select_growth_1 <- ac_visits_cases_select_growth %>%
filter(date < (max(date) - 7))
ac_visits_cases_select_growth_2 <- ac_visits_cases_select_growth %>%
filter(date >= (max(date) - 7))
model_visits_cases_growth_1 <- lm(case_growth_daily ~ visits_per_capita, ac_visits_cases_select_growth_1)
summary(model_visits_cases_growth_1)
##
## Call:
## lm(formula = case_growth_daily ~ visits_per_capita, data = ac_visits_cases_select_growth_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.0560 -3.3384 -0.9768 1.3702 17.0075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.724 5.767 -0.993 0.3253
## visits_per_capita 15.864 8.362 1.897 0.0632 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.152 on 54 degrees of freedom
## Multiple R-squared: 0.06249, Adjusted R-squared: 0.04513
## F-statistic: 3.599 on 1 and 54 DF, p-value: 0.06315
plot_ly() %>%
add_trace(data = ac_visits_cases_select_growth_1, x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "past data") %>%
add_trace(data = ac_visits_cases_select_growth_1, x = ~visits_per_capita, y = fitted(model_visits_cases_growth_1), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_select_growth_2, x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "most recent week") %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))
Removing the zero growth rate values.
model_visits_cases_growth_1_no_zero <- lm(case_growth_daily ~ visits_per_capita, ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0))
summary(model_visits_cases_growth_1_no_zero)
##
## Call:
## lm(formula = case_growth_daily ~ visits_per_capita, data = ac_visits_cases_select_growth_1 %>%
## filter(case_growth_daily > 0))
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3748 -3.1423 -0.9239 1.3312 16.3148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.379 5.837 -1.436 0.158
## visits_per_capita 21.301 8.508 2.504 0.016 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.93 on 45 degrees of freedom
## Multiple R-squared: 0.1223, Adjusted R-squared: 0.1028
## F-statistic: 6.268 on 1 and 45 DF, p-value: 0.01599
plot_ly() %>%
add_trace(data = ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "past data") %>%
add_trace(data = ac_visits_cases_select_growth_1 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = fitted(model_visits_cases_growth_1_no_zero), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1_no_zero)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_select_growth_2 %>% filter(case_growth_daily > 0), x = ~visits_per_capita, y = ~case_growth_daily, type = 'scatter', mode = 'markers', name = "most recent week") %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily growth rate of cases per person'))
lag <- 14
ac_cases_zip <- ac_cases_zip %>%
group_by(zipcode) %>%
mutate(change_cases_by_pop = c(NA, diff(cases_by_pop)))
ac_daily_visits_lag <- ac_daily_visits_zip %>%
mutate(lag_date = date + lag) %>%
dplyr::select(-date) %>%
rename(date = lag_date)
ac_visits_cases <- left_join(ac_cases_zip, ac_daily_visits_lag) %>%
filter(cases >= 10) %>% # only select once cases have started to go above the threshold
mutate(visits_per_capita = total_visits_avg / total_pop_zip) %>%
group_by(zipcode) %>%
mutate(visits_mov = movavg(visits_per_capita, 5, type = "s"),
change_cases_by_pop_mov = movavg(change_cases_by_pop, 5, type = "s")) %>%
mutate(case_growth = change_cases_by_pop / lag(cases_by_pop)) %>%
mutate(growth_mov = movavg(case_growth, 5, type = "s")) %>%
filter(!is.na(cases_by_pop) & !is.na(visits_per_capita) & !is.na(case_growth))
ac_visits_cases %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily new cases per person'))
ac_visits_cases %>% plot_ly() %>%
add_trace(x = ~visits_mov, y = ~change_cases_by_pop_mov, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag, 5 day moving average'), yaxis = list(title = 'Daily new cases per person, 5 day moving average'))
ac_visits_cases %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~case_growth, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag'), yaxis = list(title = 'Daily case growth rate'))
ac_visits_cases %>% plot_ly() %>%
add_trace(x = ~visits_mov, y = ~growth_mov, type = 'scatter', mode = 'markers') %>%
layout(title = paste0("Zip Code ", zip_select), xaxis = list(title = 'Total daily visits per person, 14 day lag, 5 day moving average'), yaxis = list(title = 'Daily case growth rate, 5 day moving average'))
Try separating out by date.
ac_visits_cases_growth_1 <- ac_visits_cases %>%
filter(date < (max(date) - 7))
ac_visits_cases_growth_2 <- ac_visits_cases %>%
filter(date >= (max(date) - 7))
model_visits_cases_1 <- lm(change_cases_by_pop ~ visits_per_capita, ac_visits_cases_growth_1)
summary(model_visits_cases_1)
##
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_visits_cases_growth_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.169e-05 -3.061e-05 -1.951e-05 1.170e-05 6.748e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.581e-06 5.304e-06 0.675 0.5
## visits_per_capita 4.186e-05 7.907e-06 5.294 1.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.186e-05 on 2108 degrees of freedom
## Multiple R-squared: 0.01312, Adjusted R-squared: 0.01265
## F-statistic: 28.03 on 1 and 2108 DF, p-value: 1.322e-07
model_visits_cases_1_mov <- lm(change_cases_by_pop_mov ~ visits_mov, ac_visits_cases_growth_1 %>% filter(!is.na(change_cases_by_pop_mov) & !is.na(visits_mov)))
summary(model_visits_cases_1_mov)
##
## Call:
## lm(formula = change_cases_by_pop_mov ~ visits_mov, data = ac_visits_cases_growth_1 %>%
## filter(!is.na(change_cases_by_pop_mov) & !is.na(visits_mov)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.962e-05 -2.477e-05 -1.238e-05 1.201e-05 3.258e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.254e-06 4.512e-06 1.829 0.0675 .
## visits_mov 3.854e-05 6.698e-06 5.754 1e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.975e-05 on 2104 degrees of freedom
## Multiple R-squared: 0.01549, Adjusted R-squared: 0.01502
## F-statistic: 33.11 on 1 and 2104 DF, p-value: 1.001e-08
plot_ly() %>%
add_trace(data = ac_visits_cases_growth_1 %>% filter(!is.na(change_cases_by_pop_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~change_cases_by_pop_mov, type = 'scatter', mode = 'markers', name = "past data") %>%
add_trace(data = ac_visits_cases_growth_1 %>% filter(!is.na(change_cases_by_pop_mov) & !is.na(visits_mov)), x = ~visits_mov, y = fitted(model_visits_cases_1_mov), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_1_mov)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_growth_2 %>% filter(!is.na(change_cases_by_pop_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~change_cases_by_pop_mov, type = 'scatter', mode = 'markers', name = "most recent week") %>%
layout(title = 'Alameda County by zip code', xaxis = list(title = 'Total daily visits per person, 14 day lag, 5 day moving avg'), yaxis = list(title = 'Daily change in cases per person, 5 day moving avg'))
model_visits_cases_growth_1 <- lm(case_growth ~ visits_per_capita, ac_visits_cases_growth_1)
summary(model_visits_cases_growth_1)
##
## Call:
## lm(formula = case_growth ~ visits_per_capita, data = ac_visits_cases_growth_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.09615 -0.03170 -0.01920 0.01414 0.52340
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.011050 0.005615 -1.968 0.0492 *
## visits_per_capita 0.065891 0.008371 7.871 5.56e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0549 on 2108 degrees of freedom
## Multiple R-squared: 0.02855, Adjusted R-squared: 0.02809
## F-statistic: 61.96 on 1 and 2108 DF, p-value: 5.556e-15
model_visits_cases_growth_1_mov <- lm(growth_mov ~ visits_mov, ac_visits_cases_growth_1 %>% filter(!is.na(growth_mov) & !is.na(visits_mov)))
summary(model_visits_cases_growth_1_mov)
##
## Call:
## lm(formula = growth_mov ~ visits_mov, data = ac_visits_cases_growth_1 %>%
## filter(!is.na(growth_mov) & !is.na(visits_mov)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.039150 -0.020670 -0.009317 0.009959 0.185408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.002746 0.004551 -0.603 0.546
## visits_mov 0.051473 0.006851 7.513 8.75e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03266 on 1950 degrees of freedom
## Multiple R-squared: 0.02813, Adjusted R-squared: 0.02763
## F-statistic: 56.45 on 1 and 1950 DF, p-value: 8.748e-14
plot_ly() %>%
add_trace(data = ac_visits_cases_growth_1 %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~growth_mov, type = 'scatter', mode = 'markers', name = "past data") %>%
add_trace(data = ac_visits_cases_growth_1 %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = fitted(model_visits_cases_growth_1_mov), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1_mov)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_growth_2 %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~growth_mov, type = 'scatter', mode = 'markers', name = "most recent week") %>%
layout(title = 'Alameda County by zip code', xaxis = list(title = 'Total daily visits per person, 14 day lag, 5 day moving avg'), yaxis = list(title = 'Daily growth rate of cases per person, 5 day moving avg'))
Removing zero values.
ac_visits_cases_growth_1_no_zero <- ac_visits_cases %>%
filter(date < (max(date) - 7)) %>%
filter(case_growth > 0)
ac_visits_cases_growth_2_no_zero <- ac_visits_cases %>%
filter(date >= (max(date) - 7)) %>%
filter(case_growth > 0)
model_visits_cases_growth_1_no_zero <- lm(case_growth ~ visits_per_capita, ac_visits_cases_growth_1_no_zero)
summary(model_visits_cases_growth_1_no_zero)
##
## Call:
## lm(formula = case_growth ~ visits_per_capita, data = ac_visits_cases_growth_1_no_zero)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.06394 -0.03786 -0.01594 0.01551 0.49307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.037588 0.009752 3.855 0.000123 ***
## visits_per_capita 0.037966 0.013937 2.724 0.006552 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06412 on 1064 degrees of freedom
## Multiple R-squared: 0.006926, Adjusted R-squared: 0.005993
## F-statistic: 7.421 on 1 and 1064 DF, p-value: 0.006552
model_visits_cases_growth_1_mov_no_zero <- lm(growth_mov ~ visits_mov, ac_visits_cases_growth_1_no_zero %>% filter(!is.na(growth_mov) & !is.na(visits_mov)))
summary(model_visits_cases_growth_1_mov_no_zero)
##
## Call:
## lm(formula = growth_mov ~ visits_mov, data = ac_visits_cases_growth_1_no_zero %>%
## filter(!is.na(growth_mov) & !is.na(visits_mov)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.04488 -0.02412 -0.01060 0.01222 0.17525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.017395 0.008185 2.125 0.03382 *
## visits_mov 0.038586 0.011860 3.253 0.00118 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03706 on 988 degrees of freedom
## Multiple R-squared: 0.0106, Adjusted R-squared: 0.009598
## F-statistic: 10.58 on 1 and 988 DF, p-value: 0.001179
plot_ly() %>%
add_trace(data = ac_visits_cases_growth_1_no_zero %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~growth_mov, type = 'scatter', mode = 'markers', name = "past data") %>%
add_trace(data = ac_visits_cases_growth_1_no_zero %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = fitted(model_visits_cases_growth_1_mov_no_zero), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_visits_cases_growth_1_mov_no_zero)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_growth_2_no_zero %>% filter(!is.na(growth_mov) & !is.na(visits_mov)), x = ~visits_mov, y = ~growth_mov, type = 'scatter', mode = 'markers', name = "most recent week") %>%
layout(title = 'Alameda County by zip code, zeros removed', xaxis = list(title = 'Total daily visits per person, 14 day lag, 5 day moving avg'), yaxis = list(title = 'Daily growth rate of cases per person, 5 day moving avg'))
start_date_weeks <- as.Date("2020-03-28")
ac_visits_cases_weekly <- ac_visits_cases %>%
mutate(week = floor((date - start_date_weeks) / 7) + 1) %>%
group_by(zipcode, week) %>%
summarize(final_cases_by_pop_week = max(cases_by_pop),
total_visits_per_capita_week = sum(visits_per_capita)) %>%
group_by(zipcode) %>%
mutate(change_cases_by_pop_week = c(NA, diff(final_cases_by_pop_week)),
case_growth_week = change_cases_by_pop_week / lag(final_cases_by_pop_week),
log_cases_by_pop_week = log(final_cases_by_pop_week),
change_log_cases_by_pop_week = c(NA, diff(log_cases_by_pop_week))) %>%
filter(!is.na(case_growth_week)) %>%
filter(week < max(week)) # remove last week since it wasn't a full week
ac_visits_cases_weekly %>% plot_ly() %>%
add_trace(x = ~total_visits_per_capita_week, y = ~change_cases_by_pop_week, type = 'scatter', mode = 'markers', color = ~zipcode, text = ~week) %>%
layout(title = "Alameda County", xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly change in cases per person'))
ac_visits_cases_weekly %>% plot_ly() %>%
add_trace(x = ~total_visits_per_capita_week, y = ~case_growth_week, type = 'scatter', mode = 'markers', color = ~zipcode, text = ~week) %>%
layout(title = "Alameda County", xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly growth rate of cases per person'))
Look at the week 10 (latest full week) data when we fit a trendline to all other week data.
ac_visits_cases_weekly_1 <- ac_visits_cases_weekly %>%
filter(week < max(week))
ac_visits_cases_weekly_2 <- ac_visits_cases_weekly %>%
filter(week == max(week))
model_week_1_abs <- lm(change_cases_by_pop_week ~ total_visits_per_capita_week, ac_visits_cases_weekly_1)
summary(model_week_1_abs)
##
## Call:
## lm(formula = change_cases_by_pop_week ~ total_visits_per_capita_week,
## data = ac_visits_cases_weekly_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.741e-04 -1.502e-04 -6.302e-05 6.679e-05 1.820e-03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.584e-05 1.071e-04 -0.895 0.37163
## total_visits_per_capita_week 6.869e-05 2.317e-05 2.964 0.00331 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0002618 on 265 degrees of freedom
## Multiple R-squared: 0.03209, Adjusted R-squared: 0.02844
## F-statistic: 8.787 on 1 and 265 DF, p-value: 0.00331
plot_ly() %>%
add_trace(data = ac_visits_cases_weekly_1, x = ~total_visits_per_capita_week, y = ~change_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "past data", text = ~week) %>%
add_trace(data = ac_visits_cases_weekly_1 , x = ~total_visits_per_capita_week, y = fitted(model_week_1_abs), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_week_1_abs)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_weekly_2, x = ~total_visits_per_capita_week, y = ~change_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "most recent week", text = ~week) %>%
layout(title = 'Alameda County by zip code', xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly change in cases per person'))
model_week_1 <- lm(case_growth_week ~ total_visits_per_capita_week, ac_visits_cases_weekly_1)
summary(model_week_1)
##
## Call:
## lm(formula = case_growth_week ~ total_visits_per_capita_week,
## data = ac_visits_cases_weekly_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28221 -0.14975 -0.07356 0.07097 1.47371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.06530 0.10674 -0.612 0.54124
## total_visits_per_capita_week 0.06452 0.02310 2.794 0.00559 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.261 on 265 degrees of freedom
## Multiple R-squared: 0.02861, Adjusted R-squared: 0.02494
## F-statistic: 7.804 on 1 and 265 DF, p-value: 0.005594
plot_ly() %>%
add_trace(data = ac_visits_cases_weekly_1, x = ~total_visits_per_capita_week, y = ~case_growth_week, type = 'scatter', mode = 'markers', name = "past data", text = ~week) %>%
add_trace(data = ac_visits_cases_weekly_1 , x = ~total_visits_per_capita_week, y = fitted(model_week_1), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_week_1)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_weekly_2, x = ~total_visits_per_capita_week, y = ~case_growth_week, type = 'scatter', mode = 'markers', name = "most recent week", text = ~week) %>%
layout(title = 'Alameda County by zip code', xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly growth rate of cases per person'))
Remove zeros.
model_week_1_no_zeros_abs <- lm(change_cases_by_pop_week ~ total_visits_per_capita_week, ac_visits_cases_weekly_1 %>% filter(change_cases_by_pop_week > 0))
summary(model_week_1_no_zeros_abs)
##
## Call:
## lm(formula = change_cases_by_pop_week ~ total_visits_per_capita_week,
## data = ac_visits_cases_weekly_1 %>% filter(change_cases_by_pop_week >
## 0))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.652e-04 -1.547e-04 -7.274e-05 8.047e-05 1.796e-03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.756e-05 1.193e-04 -0.315 0.7532
## total_visits_per_capita_week 5.949e-05 2.563e-05 2.321 0.0211 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0002663 on 244 degrees of freedom
## Multiple R-squared: 0.02161, Adjusted R-squared: 0.0176
## F-statistic: 5.389 on 1 and 244 DF, p-value: 0.02109
plot_ly() %>%
add_trace(data = ac_visits_cases_weekly_1 %>% filter(change_cases_by_pop_week > 0), x = ~total_visits_per_capita_week, y = ~change_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "past weeks", text = ~week) %>%
add_trace(data = ac_visits_cases_weekly_1 %>% filter(change_cases_by_pop_week > 0), x = ~total_visits_per_capita_week, y = fitted(model_week_1_no_zeros_abs), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_week_1_no_zeros_abs)$r.squared), name = "past week fit") %>%
add_trace(data = ac_visits_cases_weekly_2 %>% filter(change_cases_by_pop_week > 0), x = ~total_visits_per_capita_week, y = ~change_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "most recent week", text = ~week) %>%
layout(title = 'Alameda County by zip code, zeros removed', xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly change in cases per person'))
model_week_1_no_zeros <- lm(case_growth_week ~ total_visits_per_capita_week, ac_visits_cases_weekly_1 %>% filter(case_growth_week > 0))
summary(model_week_1_no_zeros)
##
## Call:
## lm(formula = case_growth_week ~ total_visits_per_capita_week,
## data = ac_visits_cases_weekly_1 %>% filter(case_growth_week >
## 0))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27649 -0.15046 -0.08185 0.06625 1.46041
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.00291 0.11853 0.025 0.9804
## total_visits_per_capita_week 0.05341 0.02545 2.099 0.0369 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2645 on 244 degrees of freedom
## Multiple R-squared: 0.01773, Adjusted R-squared: 0.0137
## F-statistic: 4.404 on 1 and 244 DF, p-value: 0.03688
plot_ly() %>%
add_trace(data = ac_visits_cases_weekly_1 %>% filter(case_growth_week > 0), x = ~total_visits_per_capita_week, y = ~case_growth_week, type = 'scatter', mode = 'markers', name = "past weeks", text = ~week) %>%
add_trace(data = ac_visits_cases_weekly_1 %>% filter(case_growth_week > 0), x = ~total_visits_per_capita_week, y = fitted(model_week_1_no_zeros), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_week_1_no_zeros)$r.squared), name = "past week fit") %>%
add_trace(data = ac_visits_cases_weekly_2 %>% filter(case_growth_week > 0), x = ~total_visits_per_capita_week, y = ~case_growth_week, type = 'scatter', mode = 'markers', name = "most recent week", text = ~week) %>%
layout(title = 'Alameda County by zip code, zeros removed', xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly growth rate of cases per person'))
Try with change in log of cases.
model_week_1_log <- lm(change_log_cases_by_pop_week ~ total_visits_per_capita_week, ac_visits_cases_weekly_1)
summary(model_week_1_log)
##
## Call:
## lm(formula = change_log_cases_by_pop_week ~ total_visits_per_capita_week,
## data = ac_visits_cases_weekly_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.22661 -0.11383 -0.04483 0.06932 0.79698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01958 0.07059 -0.277 0.78173
## total_visits_per_capita_week 0.04571 0.01527 2.993 0.00303 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1726 on 265 degrees of freedom
## Multiple R-squared: 0.03269, Adjusted R-squared: 0.02904
## F-statistic: 8.957 on 1 and 265 DF, p-value: 0.003025
plot_ly() %>%
add_trace(data = ac_visits_cases_weekly_1, x = ~total_visits_per_capita_week, y = ~change_log_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "past data", text = ~week) %>%
add_trace(data = ac_visits_cases_weekly_1 , x = ~total_visits_per_capita_week, y = fitted(model_week_1_log), type = 'scatter', mode = 'lines', text = paste("R squared:", summary.lm(model_week_1_log)$r.squared), name = "past data fit") %>%
add_trace(data = ac_visits_cases_weekly_2, x = ~total_visits_per_capita_week, y = ~change_log_cases_by_pop_week, type = 'scatter', mode = 'markers', name = "most recent week", text = ~week) %>%
layout(title = 'Alameda County by zip code', xaxis = list(title = 'Total weekly visits per person, 2 week lag'), yaxis = list(title = 'Weekly change in log(cases per person)'))